/*
// MULTOPP5 - by Paul Hsieh (c) 1997, 1998, All rights reserved.
//
// This program is public domain, subject to the conditions that any use made
// of it that results in a derivative work or product must credit the author,
// Paul Hsieh and that this source never be distributed without these comments
// appearing intact at the top of the program.
//
// MULTOPP5 searches for optimal sequences of adds, subs, shls, leas, to
// simulate a constant multiply on a Pentium.  It is assumed that no more
// than two registers are used (likely a somewhat faulty, but necessary
// assumption) and that the intermediate values are bounded by limits of
// 0..n, where n is the maximum multiply factor simulated.  The results tend
// to justify the assumptions.  It is easy to show that using more than 2
// registers only affects sequences of at least 5 instructions/3 clocks; that
// is to say no sequences given here in 3 clocks or less can be improved for
// total clock count.  For numbers up to 2048, there is never a need to go
// beyond 6 clocks; nearly 90% of them running in 5 clocks or fewer.
//
// The ordering used is a lexical one:
//
//  (number of clocks, whether or not the V pipe is free, total size)
//
// The first two optimize clock usage, taking into account that its potential
// use inside other code is best served by additional pairing after it has
// completed.  There is no heuristic for waiting before writing to the ebx
// register.  The code size optimization is good because otherwise the code
// has a tendency to add in extra meaningless instructions that does not
// affect the total clock count, but which is not how any reasonable human 
// would code.
//
// Terje Mathisen made the good point that assuming eax is not ready for 
// address generation on the first clock would also be a good thing to add.  I 
// have not looked into this.
//
// The algorithm is a departure from my original one which used an "OpenList"
// and simply grew it incrementally as required.  The present algorithm is
// simply a depth first search through the code space, pruning code sequences
// that are less efficient than code sequences already encountered.  The
// search is iteratively deepened by maximum (clock,V pipe state) allowed by
// minimal granular steps so that nothing is missed along the search, and so
// the search path can find suboptimal paths before wasting unpredictable
// resources on them.  A table of "best encounters so far" is maintained.
// The size of this table, so far, appears to be the biggest limiting factor 
// as I can produce results up to 2048 (up from 512; my 1993 result) easily, 
// but 4096 is completely undoable with 32 MB of RAM.  Terje Mathisen was able 
// to reach 4096 on a 64MB equipped DEC alpha, and indeed pushed it up to 
// 5040. However, searches up to non-power of 2s are suspect since the 
// corresponding left shift over that number is not present in the search 
// space of partial results.
//
// Besides leading to optimal p5 multiplies (something of diminishing value
// as the x86 series gets faster in subsequent iterations) I am hoping this
// example of "finding optimal code sequences" germinates discussion on
// optimal code generation in general.
//
// Thanks to Jeremy McDonald, Robert Prins and Bjorn Stromvall for pointing 
// out a couple obvious flaws with the first version of this program (some of 
// the instructions I was using had obvious faster or shorter equivalents.)
*/

#include <stdio.h>

typedef unsigned char byte;
typedef struct {
        int CodeLength;
        int Weight[4];
        byte CodeFrag[8];
        byte cost;
        byte p5flags;
        char * Name;
} StepType;

/* Pentium pairing bitflags */

#define UV 0xC0
#define PU 0x80
#define PV 0x40
#define NP 0x00
#define AB 0x03
#define Ax 0x02
#define xB 0x01
#define xx 0x00

/* ... including potential AGI stalls */

#define ab (0x0C|AB)
#define ax (0x08|Ax)
#define xb (0x04|xB)

#define MBS 2

#define P5Attr(pairing,readreg,writereg) ((pairing)|((readreg)<<MBS)|(writereg))

/*
    The instruction characteristics.  The Weight[] vector is really a 2x2 
    matrix that and (EAX,EBX) vector is multiplied by to calculate the effect 
    of the instruction on these registers.

    In another version of this program I actually used the code fragments to
    compile up the resulting subroutines to test their correctness.

    The cost instruction field was to be used for encompassing multi-clock
    instructions (I wrote the original version before Pentium's ever existed.)
    Of course none of these instructions are multi-clock.  Hmmm ... now that I
    think of is, technically, I could put 'xchg eax,ebx' in here, but I highly
    doubt it would ever be used.

    The p5flags field encode the pairability of the instruction, the register
    read requirements and the register write requirements.

    The name field just gives what the instruction mneumonic is, so that the
    program can output results in a readable form.
*/

StepType Step[] = {
{2, {1, 0, 1, 0}, { 0x89, 0xc3 }        , 1, P5Attr(UV,Ax,xB), " mov     ebx,eax         " },
{2, {0, 1, 0, 1}, { 0x89, 0xd8 }        , 1, P5Attr(UV,xB,Ax), " mov     eax,ebx         " },
{2, {0, 0, 0, 1}, { 0x31, 0xc0 }        , 1, P5Attr(UV,Ax,Ax), " xor     eax,eax         " },
{2, {1, 1, 0, 1}, { 0x01, 0xd8 }        , 1, P5Attr(UV,AB,Ax), " add     eax,ebx         " },
{2, {1, 0, 1, 1}, { 0x01, 0xc3 }        , 1, P5Attr(UV,AB,xB), " add     ebx,eax         " },
{2, {1,-1, 0, 1}, { 0x29, 0xd8 }        , 1, P5Attr(UV,AB,Ax), " sub     eax,ebx         " },
{2, {1, 0,-1, 1}, { 0x29, 0xc3 }        , 1, P5Attr(UV,AB,xB), " sub     ebx,eax         " },
{2, {2, 0, 0, 1}, { 0x01, 0xc0 }        , 1, P5Attr(UV,Ax,Ax), " add     eax,eax         " },
{2, {1, 0, 0, 2}, { 0x01, 0xdb }        , 1, P5Attr(UV,xB,xB), " add     ebx,ebx         " },
{3, {  4, 0, 0, 1}, { 0xc1, 0xe0, 0x02 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,02H         " },
{3, {  8, 0, 0, 1}, { 0xc1, 0xe0, 0x03 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,03H         " },
{3, { 16, 0, 0, 1}, { 0xc1, 0xe0, 0x04 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,04H         " },
{3, { 32, 0, 0, 1}, { 0xc1, 0xe0, 0x05 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,05H         " },
{3, { 64, 0, 0, 1}, { 0xc1, 0xe0, 0x06 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,06H         " },
{3, {128, 0, 0, 1}, { 0xc1, 0xe0, 0x07 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,07H         " },
{3, {256, 0, 0, 1}, { 0xc1, 0xe0, 0x08 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,08H         " },
{3, {512, 0, 0, 1}, { 0xc1, 0xe0, 0x09 }, 1, P5Attr(PU,Ax,Ax), " shl     eax,09H         " },
{3,{1024, 0, 0, 1}, { 0xc1, 0xe0, 0x0A }, 1, P5Attr(PU,Ax,Ax), " shl     eax,0aH         " },
{3,{2048, 0, 0, 1}, { 0xc1, 0xe0, 0x0B }, 1, P5Attr(PU,Ax,Ax), " shl     eax,0bH         " },
{3, {3, 0, 0, 1}, { 0x8d, 0x04, 0x40 }  , 1, P5Attr(UV,ax,Ax), " lea     eax,[eax+eax*2] " },
{3, {5, 0, 0, 1}, { 0x8d, 0x04, 0x80 }  , 1, P5Attr(UV,ax,Ax), " lea     eax,[eax+eax*4] " },
{3, {9, 0, 0, 1}, { 0x8d, 0x04, 0xc0 }  , 1, P5Attr(UV,ax,Ax), " lea     eax,[eax+eax*8] " },
{3, {1, 0, 0,   4}, { 0xc1, 0xe3, 0x02 }, 1, P5Attr(PU,xB,xB), " shl     ebx,02H         " },
{3, {1, 0, 0,   8}, { 0xc1, 0xe3, 0x03 }, 1, P5Attr(PU,xB,xB), " shl     ebx,03H         " },
{3, {1, 0, 0,  16}, { 0xc1, 0xe3, 0x04 }, 1, P5Attr(PU,xB,xB), " shl     ebx,04H         " },
{3, {1, 0, 0,  32}, { 0xc1, 0xe3, 0x05 }, 1, P5Attr(PU,xB,xB), " shl     ebx,05H         " },
{3, {1, 0, 0,  64}, { 0xc1, 0xe3, 0x06 }, 1, P5Attr(PU,xB,xB), " shl     ebx,06H         " },
{3, {1, 0, 0, 128}, { 0xc1, 0xe3, 0x07 }, 1, P5Attr(PU,xB,xB), " shl     ebx,07H         " },
{3, {1, 0, 0, 256}, { 0xc1, 0xe3, 0x08 }, 1, P5Attr(PU,xB,xB), " shl     ebx,08H         " },
{3, {1, 0, 0, 512}, { 0xc1, 0xe3, 0x09 }, 1, P5Attr(PU,xB,xB), " shl     ebx,09H         " },
{3, {1, 0, 0,1024}, { 0xc1, 0xe3, 0x0a }, 1, P5Attr(PU,xB,xB), " shl     ebx,0aH         " },
{3, {1, 0, 0,2048}, { 0xc1, 0xe3, 0x0b }, 1, P5Attr(PU,xB,xB), " shl     ebx,0bH         " },
{3, {1, 0, 0, 3}, { 0x8d, 0x1c, 0x5b }  , 1, P5Attr(UV,xb,xB), " lea     ebx,[ebx+ebx*2] " },
{3, {1, 0, 0, 5}, { 0x8d, 0x1c, 0x9b }  , 1, P5Attr(UV,xb,xB), " lea     ebx,[ebx+ebx*4] " },
{3, {1, 0, 0, 9}, { 0x8d, 0x1c, 0xdb }  , 1, P5Attr(UV,xb,xB), " lea     ebx,[ebx+ebx*8] " },
{3, {1, 2, 0, 1}, { 0x8d, 0x04, 0x58 }  , 1, P5Attr(UV,ab,Ax), " lea     eax,[eax+ebx*2] " },
{3, {1, 4, 0, 1}, { 0x8d, 0x04, 0x98 }  , 1, P5Attr(UV,ab,Ax), " lea     eax,[eax+ebx*4] " },
{3, {1, 8, 0, 1}, { 0x8d, 0x04, 0xd8 }  , 1, P5Attr(UV,ab,Ax), " lea     eax,[eax+ebx*8] " },
{3, {1, 0, 1, 2}, { 0x8d, 0x1c, 0x58 }  , 1, P5Attr(UV,ab,xB), " lea     ebx,[eax+ebx*2] " },
{3, {1, 0, 1, 4}, { 0x8d, 0x1c, 0x98 }  , 1, P5Attr(UV,ab,xB), " lea     ebx,[eax+ebx*4] " },
{3, {1, 0, 1, 8}, { 0x8d, 0x1c, 0xd8 }  , 1, P5Attr(UV,ab,xB), " lea     ebx,[eax+ebx*8] " },
{3, {2, 1, 0, 1}, { 0x8d, 0x04, 0x43 }  , 1, P5Attr(UV,ab,Ax), " lea     eax,[ebx+eax*2] " },
{3, {4, 1, 0, 1}, { 0x8d, 0x04, 0x83 }  , 1, P5Attr(UV,ab,Ax), " lea     eax,[ebx+eax*4] " },
{3, {8, 1, 0, 1}, { 0x8d, 0x04, 0xc3 }  , 1, P5Attr(UV,ab,Ax), " lea     eax,[ebx+eax*8] " },
{3, {1, 0, 2, 1}, { 0x8d, 0x1c, 0x43 }  , 1, P5Attr(UV,ab,xB), " lea     ebx,[ebx+eax*2] " },
{3, {1, 0, 4, 1}, { 0x8d, 0x1c, 0x83 }  , 1, P5Attr(UV,ab,xB), " lea     ebx,[ebx+eax*4] " },
{3, {1, 0, 8, 1}, { 0x8d, 0x1c, 0xc3 }  , 1, P5Attr(UV,ab,xB), " lea     ebx,[ebx+eax*8] " },
{3, {0, 2, 0, 1}, { 0x8d, 0x04, 0x1b }  , 1, P5Attr(UV,xb,Ax), " lea     eax,[ebx+ebx]   " },
{3, {0, 3, 0, 1}, { 0x8d, 0x04, 0x5b }  , 1, P5Attr(UV,xb,Ax), " lea     eax,[ebx+ebx*2] " },
{3, {0, 5, 0, 1}, { 0x8d, 0x04, 0x9b }  , 1, P5Attr(UV,xb,Ax), " lea     eax,[ebx+ebx*4] " },
{3, {0, 9, 0, 1}, { 0x8d, 0x04, 0xdb }  , 1, P5Attr(UV,xb,Ax), " lea     eax,[ebx+ebx*8] " },
{3, {1, 0, 2, 0}, { 0x8d, 0x1c, 0x00 }  , 1, P5Attr(UV,ax,xB), " lea     ebx,[eax+eax]   " },
{3, {1, 0, 3, 0}, { 0x8d, 0x1c, 0x40 }  , 1, P5Attr(UV,ax,xB), " lea     ebx,[eax+eax*2] " },
{3, {1, 0, 5, 0}, { 0x8d, 0x1c, 0x80 }  , 1, P5Attr(UV,ax,xB), " lea     ebx,[eax+eax*4] " },
{3, {1, 0, 9, 0}, { 0x8d, 0x1c, 0xc0 }  , 1, P5Attr(UV,ax,xB), " lea     ebx,[eax+eax*8] " },
{7, {0, 4, 0, 1}, { 0x8d, 0x04, 0x9d, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,Ax), " lea     eax,[ebx*4+0H] " },
{7, {0, 8, 0, 1}, { 0x8d, 0x04, 0xdd, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,Ax), " lea     eax,[ebx*8+0H] " },
{7, {1, 0, 4, 0}, { 0x8d, 0x1c, 0x85, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,xB), " lea     ebx,[eax*4+0H] " },
{7, {1, 0, 8, 0}, { 0x8d, 0x1c, 0xc5, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,xB), " lea     ebx,[eax*8+0H] " },
{7, {4, 0, 0, 1}, { 0x8d, 0x04, 0x85, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,Ax), " lea     eax,[eax*4+0H] " },
{7, {8, 0, 0, 1}, { 0x8d, 0x04, 0xc5, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,Ax), " lea     eax,[eax*8+0H] " },
{7, {1, 0, 0, 4}, { 0x8d, 0x1c, 0x9d, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,xB), " lea     ebx,[ebx*4+0H] " },
{7, {1, 0, 0, 8}, { 0x8d, 0x1c, 0xdd, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,xB), " lea     ebx,[ebx*8+0H] " },
{0, {1, 0, 0, 1}, { 0x90, 0x90, 0x90 }   } , -1, P5Attr(NP,AB,AB), " Sentinel                " };

/*
    Declaring ClockCountType as a float allows me to put a hack for the
    total code length (otherwise a lot of silly results show up, where
    dead clocks are filled with irrelevant instructions.)  However, as the
    whole algorithm is limited by the "SpaceCost" array size, it can make
    sense to make this a short, or indeed a char.  I have only experimented
    a little here.
*/

typedef float ClockCountType;

/*
    The pipe state structure is what I use to model the pentium's internal
    resource contentions.  It basically keeps track of the clocks consumed
    so far as well as usage bits for RAW (read after write) and WAW (write
    after write) dependencies as well as potential AGI contentions from the 
    previous clock.
*/

typedef struct {
        ClockCountType clocks;
	unsigned char freepipe;
	unsigned char contentionmask;
} PipeStateStructType;

#define UFREE PU
#define VFREE PV

void InitPSS(PipeStateStructType * pss) {
    pss->clocks=0;
    pss->freepipe = UFREE;
    pss->contentionmask = 0;
}

/*
    Model Pentium scheduling as best as I know it, to count clocks.  Also
    adds in tiny fractional fudge clocks for total code length.
*/

PipeStateStructType * AddInstruction(PipeStateStructType * pss, int inst) {
static PipeStateStructType new_pss;
char imask;
ClockCountType iclocks;
int RAW,AGI,i,j;

    imask = Step[inst].p5flags;
    iclocks = Step[inst].cost;

    /* Contentions on current instruction, work out first clock */

    new_pss = *pss;

    /* Is there a RAW, WAW or AGI contention with the previous state? */

    j = ((((imask&AB)<<MBS)|imask) & pss->contentionmask)&(~(UV|AB)); /* ?AW */
    i = (imask & pss->contentionmask)&(~(UV|AB));                     /* RAW */

    /* Is there an AGI stall? */

    if( (i & (AB<<(2*(MBS))))!=0 ) {    /* RAW on address? => AGI stall */
        new_pss.clocks+=1;
        i &= ~(AB<<(2*(MBS)));          /* Doesn't affect issue pairing */
    }

    /* Does this instruction pair?  And if so is it a free clock or not? */

    if( (0!=j) || ((imask & (pss->freepipe))==0) ) { 	/* WAW/RAW upariable */
        new_pss.clocks+=1;
	new_pss.freepipe = UFREE;
	if( imask & PU ) new_pss.freepipe = VFREE;
    } else {
	new_pss.freepipe ^= (UFREE ^ VFREE);
        if( new_pss.freepipe==VFREE ) new_pss.clocks+=1;
    } 

    /* The potential extra clocks on this instruction */

    new_pss.clocks+=(iclocks-1);

    /* Work out end of clock conditions; Available for next pair?  Extra 
       shadowed delay? */

    RAW=0;                              /* No RAW on next inst? */
    if( new_pss.clocks!=pss->clocks ) 
	RAW=imask & AB;  		/* V pipe dependencies on next inst */

    AGI = 0;                            /* No stall after multiple clocks? */
    if( new_pss.clocks==pss->clocks )
        AGI = ((pss->contentionmask)>>MBS);     /* AGI on previous writes */
    else if( iclocks<2 ) {
        AGI = (pss->contentionmask)>>(2*(MBS)); /* AGI from pairing */
    }
    AGI = (AGI|imask) & AB;             /* AGI on current writes as well */

    /* These are the contention flags */

    new_pss.contentionmask = ((AGI<<MBS)|RAW)<<MBS;

    /* Instruction length fudge factor (smaller code is better) */

    new_pss.clocks += (Step[inst].CodeLength)/1024.0;

    return &new_pss;
}

/* Note that the limit has been set to 512 so that it can run on even modest
   PCs.  Change the number to a higher value to get more results (though you
   will need a quadratically increased amount of memory for the program to
   complete in a reasonable amount of time.) */

#define Limit 2048
#define ValLimit (Limit+2)
#define Invalid 0x5AAA
#define MAXCLOCKS 255
#define MaxPathLength 12

int Path[MaxPathLength];   /* Yeah, like we'll ever be able to do 12 ... */

unsigned char Done[Limit+2];
ClockCountType Best[Limit+2];
ClockCountType SpaceCost[Limit+1][Limit+2];
unsigned int Reported = 0;

int FragmentLength[Limit+2];
unsigned char Fragment[Limit+2][MaxPathLength];

void InitSpaceState() {
int i,j;
    for(i=0;i<=Limit;i++) {
        for(j=0;j<=Limit+1;j++) {
            SpaceCost[i][j] = MAXCLOCKS;/* Unrealistic maximum total clocks */
        }
    }
    for(i=0;i<=Limit+1;i++) {
        Done[i] = 0;                    /* Not yet output */
        Best[i] = MAXCLOCKS;
    }
}

/* Do single component matrix multiply */

short WeightedSum( short A, short B, short Aw, short Bw ) {
    if( ((Aw!=0) && A ==Invalid) ) return(Invalid);
    if( (( A!=0) && Aw==Invalid) ) return(Invalid);
    if( ((Bw!=0) && B ==Invalid) ) return(Invalid);
    if( (( B!=0) && Bw==Invalid) ) return(Invalid);
    return( A*Aw+B*Bw );
}

/* Do vector x matrix multiply and track "invalid" assignments */

int ApplyInst( short *A, short *B, int I ) {
int a,b;

    a = WeightedSum( *A, *B, Step[I].Weight[0], Step[I].Weight[1] );
    b = WeightedSum( *A, *B, Step[I].Weight[2], Step[I].Weight[3] );

    if( a<0 || a>Limit ) a = Invalid;
    if( b<0 || b>Limit ) b = Invalid;

    /* No sense in turning a good value to a bad value */

    if( (a==Invalid) && ((*A)!=Invalid) ) return 0;
    if( (b==Invalid) && ((*B)!=Invalid) ) return 0;

    *A = a;
    *B = b;

    return 1;
}

void GiveResult(int i) {
int j;
    printf("Mul%04d:\t; // %d clocks\n",i,(int)(Best[i])/2);
    for(j=0;j<FragmentLength[i];j++)
        printf("    %s\n",Step[ Fragment[i][j] ].Name);
    Done[i]=1;
    Reported++;
}

/*
    Hacked sentinel indicated a useless node (already been exhaustively 
    searched).

    Terje Mathisen modified a version of this code to remove the need for
    this hack by simply testing the A, B values for any out of range result.
    It confirms that using a single out of range value, as I have done is 
    also sufficient.  I did not merge with Terje's code, since it did not 
    substantially improve functionality or performance.
*/

#define USELESS_PATH -13666

/*
    Counts the potential improvments as they happen.  Used for pruning
    search paths which offer nothing else for future search iterations.
*/

static int PotentialImprovements=0;

/*
    The main search routine.  It starts with obvious exit conditions (if the
    current path is obsolete or if this path is slower than some previously
    searched path.)  It then updates the best cost if there has been an 
    improvement.  If the cost exceeds the current cost iteration, it is marked
    as a potential for a later deeper search and aborted.  If the path length
    is too long (this has never happened) it also exits.

    From here the main loop is entered and continued by adding each possible
    instruction one at a time and calling itself recursively.  If after the
    search no potential improvments for later search depths are found, then
    the search is essentially exhaustive, and therefore there is nothing more 
    to be gained in searching it again in future interations.  This pruning
    step improved the search performance by about 17% in tests I ran.
*/

void SearchInstructionSpace( int A, int B, PipeStateStructType *ps, int d, int cmax ) {
PipeStateStructType p;
int i, picount;
ClockCountType cost;
short aa,bb;

    if( B==Invalid ) B = Limit+1;

    if( SpaceCost[A][B]==USELESS_PATH ) return;
    picount = PotentialImprovements;

    cost = (ps->clocks);
    cost = cost * 2;
    if( ps->freepipe==UFREE ) cost = cost+1;    // Penalty for occupied V slot.

    if( d!=0 ) {
        if( SpaceCost[A][B] < cost ) return;	// Slower solution?
	if( SpaceCost[A][B] > cost ) {
            SpaceCost[A][B] = cost;
            if( cost<Best[A] ) {
        	Best[A]=cost;
        	FragmentLength[A]=d;
		i=0;
                do {
        	    Fragment[A][i] = Path[i];
		    i++;
		} while(i<d);
	    }
	}
    }
    if( B==Limit+1 ) B = Invalid;

    if( cost>cmax ) {
	PotentialImprovements++;
	return;
    }

    if( d>=MaxPathLength ) return;

    i=0;
    do {
        Path[d]=i;
        p = *AddInstruction(ps,i);
        aa=A;
        bb=B;
        if( 1==ApplyInst( &aa, &bb, i ) )
            SearchInstructionSpace(aa,bb,&p,d+1,cmax);
	i++;
    } while(Step[i].cost>0);

    if( picount==PotentialImprovements ) {
	if( B==Invalid ) {
	    B = Limit+1;
	    if( A==1 ) return;
	}
	SpaceCost[A][B]=USELESS_PATH;	// Prune node, nothing more to search.
    }
}

/*
    Reports on searches after each iteration. Just search for filled entries in
    the Best[] array that have not been already marked in the Done[] array.
*/

void GetReports(int c) {
int i,j;
    for(i=0;i<=Limit;i++) {
        if( Done[i]==0 && Best[i]<=c ) {
            printf("Mul%04d:\t; // %d clocks\n",i,(int)(Best[i])/2);
            for(j=0;j<FragmentLength[i];j++)
                printf("    %s\n",Step[ Fragment[i][j] ].Name);
            Done[i]=1;
            Reported++;
        }
    }
}

void main(void) {
PipeStateStructType p,q;
int i=0;
int t=0;

    InitSpaceState();

    InitPSS(&p);

/* 
    Iterate through a granularity of possible increasing costs up to but not
    including the last lexical entry (which is taken care of via the search
    itself.)  This assumes the cost never decreases as instructions are added
    which, of course, is true.
*/

    for(i=0;Reported<=Limit && i<MaxPathLength*2;i++) {
        SearchInstructionSpace( 1, Invalid, &p, 0, i );
        GetReports(i);
    }
}